{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from scipy.sparse import csr_matrix, hstack\n",
    "\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.preprocessing import LabelBinarizer\n",
    "from sklearn.model_selection import train_test_split, cross_val_score\n",
    "import lightgbm as lgb\n",
    "import pandas as pd\n",
    "import jieba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read the two input tables from the local ./input directory using pandas'\n",
    "# default CSV settings: `train` comes from train_first.csv, `test` from predict_first.csv.\n",
    "train, test = (\n",
    "    pd.read_csv(csv_path)\n",
    "    for csv_path in (\"./input/train_first.csv\", \"./input/predict_first.csv\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Stack the two frames row-wise (train first, then test) into a single table.\n",
    "# Row labels are carried over unchanged from each input (no re-indexing), so the\n",
    "# same labels occur in both parts; columns missing from one input are filled with NaN.\n",
    "data = pd.concat(objs=[train, test], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Discuss</th>\n",
       "      <th>Id</th>\n",
       "      <th>Score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的</td>\n",
       "      <td>201e8bf2-77a2-3a98-9fcf-4ce03914e712</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！</td>\n",
       "      <td>f4d51947-eac4-3005-9d3c-2f32d6068a2d</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去</td>\n",
       "      <td>74aa7ae4-03a4-394c-bee0-5702d3a3082a</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...</td>\n",
       "      <td>099661c2-4360-3c49-a2fe-8c783764f7db</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>迪斯尼一日游</td>\n",
       "      <td>97ca672d-e558-3542-ba7b-ee719bba1bab</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>方便</td>\n",
       "      <td>3b7f3f2e-886f-3a68-a810-2c37cfd728d3</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>看水看山都可以。感受古人的智慧结晶，秋景美丽如画，红黄绿相间！对于身体状况不佳的人来说，走平...</td>\n",
       "      <td>88914409-bd13-3d47-b5a2-691177dde8fd</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>赞</td>\n",
       "      <td>bf13ec92-6079-3451-ade3-88020cb0dcb5</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>唯一糟点</td>\n",
       "      <td>489c3d94-9c44-3cf2-949c-1b507c374c69</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>周未周边游</td>\n",
       "      <td>285bba78-16a3-3c1d-b648-baa483883ee3</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>景点服务不错，就是排队 太长了，好玩的项目都是人，晚上的烟火一定jrvytqlamf要看，真...</td>\n",
       "      <td>e7801d96-73d0-35c4-9e00-cc15caaa384a</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>绍兴护城河夜游</td>\n",
       "      <td>973afeca-7530-3f56-b7f5-bef36d889025</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>感觉还不错，作为一日游不错的选择～</td>\n",
       "      <td>cd91dc2f-2331-3c73-bc8d-da027337270d</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>有趣hai xing</td>\n",
       "      <td>7ce97eca-63a8-30a1-9687-6796f34606f1</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>荡气回肠，10年去的，居然没有留下来照片，必然要再去！&lt;br /&gt;n</td>\n",
       "      <td>25e21097-bd41-3589-b12c-62bc7b04eb6d</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>景色超级棒，有美丽的故事，可以乘船游览，也可以沿湖浏览，累了可以乘坐观光车！关键是没有门票！！！</td>\n",
       "      <td>98e78de7-d5d3-3b30-90d4-a63a6107d532</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>南锣鼓巷是北京市中心一条老胡同，因为其地理位置靠近什刹海，成为北京休闲娱乐的好去处，特别是外...</td>\n",
       "      <td>26334fc8-a4f1-3dc3-adb6-76b99d75cdf9</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>个人感觉就是个卖小商品的地方，还不便宜，但是晚上夜景挺好看</td>\n",
       "      <td>7f4d6d59-f732-3125-8e7d-8bd64c891b94</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>性价比超高</td>\n",
       "      <td>61522e3c-5d2a-3088-b60d-159dbc2976ce</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>挺普通的吧，就在楼下拍了几张图片，反正也是进不去的呵</td>\n",
       "      <td>37e57244-8d7e-3a1d-8f0f-8b811afb4a6a</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>太大了，在里面走了好长时间也就看了不到五分之一。但周围交通方便，值得去看一看</td>\n",
       "      <td>81502f08-b884-38b8-8169-7de7a0680a82</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>迪士尼</td>\n",
       "      <td>533a667c-d6ba-313d-bc29-588b992789e0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>亲子游</td>\n",
       "      <td>041e4056-62f2-3f25-8e3e-8f57f66cb3d8</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>来苏州总是要欣赏一下古典园林的。可惜对园林不太感冒。逛逛玩玩还是不错的。</td>\n",
       "      <td>988f2319-3292-305a-aaaf-ba18bc397e5a</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>不到长城非好汉，对于爬过华山的我来说，长城太简单了，值得一去。</td>\n",
       "      <td>c3d7dd21-79ef-3ff2-b90d-14631e9a30b4</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>很值得去的地方</td>\n",
       "      <td>4b12c7b9-059f-3016-a954-3849b0456ce4</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>第一次必到景点</td>\n",
       "      <td>5ba1fa45-4c97-3afe-9dd6-3efea9c73a94</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>好歹也是长城</td>\n",
       "      <td>f6d82a8c-ef72-3a0a-95ca-95aa2fbb7f7d</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>早上一大早就起床去看升国旗，很庄严，很整齐，就是像一个节目让人转不开眼睛，特别是老辈的人听说...</td>\n",
       "      <td>9363fc36-92a7-371f-8d5f-5dd71b565455</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>登顶是俯瞰故宫的绝佳之处，崇祯帝在此自缢！</td>\n",
       "      <td>a4dc34f1-6a97-3b86-829c-466cdaa86bf2</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29970</th>\n",
       "      <td>景色很美，原谅我的语言苍白，池水很清，可能是折射也可能是有藻类，整个池水透绿，空气很清新，忍...</td>\n",
       "      <td>f0de95d2-ef2b-3361-a6df-3f4392d74e93</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29971</th>\n",
       "      <td>人好多，人好多，人好多呀！</td>\n",
       "      <td>d0a33ded-fed9-3d92-9c51-c6512d71a93d</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29972</th>\n",
       "      <td>1.西塘管老太臭豆腐，这个是西塘最著名的小吃了，山寨冒牌遍布西塘的角角落落，而且味道也参...</td>\n",
       "      <td>b00cd736-093a-3d1e-b750-41283e37e09b</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29973</th>\n",
       "      <td>798艺术中心 来朝圣了 到处都是拍照的人，到处都是奇装异服，到处都是行为艺术，到处都是各种...</td>\n",
       "      <td>8be4f87a-c76b-3e65-8ecc-a25f209824da</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29974</th>\n",
       "      <td>心诚则灵，很多拜佛的地方都是有信仰的人才去的</td>\n",
       "      <td>9079f642-7458-3c6d-9098-c4877cc81347</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29975</th>\n",
       "      <td>十一月初来的，风很大但是不太冷，坐滑车来回，也不错，长城非常壮观，有时候爬的很累了，但是一回...</td>\n",
       "      <td>f0d47b46-9391-3d0c-b068-8753e4edd583</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29976</th>\n",
       "      <td>总体就是一般般吧，不会特别推荐</td>\n",
       "      <td>8e9afa16-33ae-3f51-8156-1e9e5337605c</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29977</th>\n",
       "      <td>大大的雪场，绚丽多彩的冰灯</td>\n",
       "      <td>3a19d80a-1cdf-3616-9707-6f8a72b4513b</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29978</th>\n",
       "      <td>这是个代表性的建筑，代表性的地点。</td>\n",
       "      <td>73081902-03a0-3f36-b562-900d6fd81f6c</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29979</th>\n",
       "      <td>坐地铁直奔王府井，因为急找住的地方把背包放下，就没溜达王府井</td>\n",
       "      <td>7fed708a-e036-36cb-98a4-f8de9fbbf3db</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29980</th>\n",
       "      <td>西双版纳热带植物园挺大的，这里很多热带植物！</td>\n",
       "      <td>cdb9eed1-54fb-3454-b98d-645c3bdfca78</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29981</th>\n",
       "      <td>从上海到西塘坐车很方便，票价也不贵。冬天的西塘虽然冷，但是放一盏小河灯，许个愿也是棒棒哒。突...</td>\n",
       "      <td>0986165f-e9a5-394c-91d2-57149a4cad4b</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29982</th>\n",
       "      <td>观音菩萨的道场，从来都没有遭遇过台风的平静之地。岛还是比较大的，感觉一天游玩还是必须要体力好...</td>\n",
       "      <td>99aeecae-9a3b-337c-ad7b-ab82bbb499b4</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29983</th>\n",
       "      <td>西湖美在有秀丽的自然风光，动人的传说，湖光山色塔影相映成趣，连如织的游人也是一道风景。喜欢这...</td>\n",
       "      <td>c0334558-d138-31cb-b714-0b6a320fa903</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29984</th>\n",
       "      <td>一点都不好玩</td>\n",
       "      <td>03f30d5a-9478-3701-b8e0-dec924ffe9a5</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29985</th>\n",
       "      <td>值得一看</td>\n",
       "      <td>a8c15edb-40a9-3014-a021-83d0d747f2d9</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29986</th>\n",
       "      <td>【点评有奖第14季】青城山风景很美，寺庙建设很特别，但景区好多没开放</td>\n",
       "      <td>856aefdd-dc4c-36b2-a102-d9333b44186c</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29987</th>\n",
       "      <td>人山人海，太多人了，特别是跟团的</td>\n",
       "      <td>179cf47c-4eb3-3c94-b097-52ab1844d67b</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29988</th>\n",
       "      <td>预订票在机器上取，快捷方便。</td>\n",
       "      <td>c5e81b73-0412-3120-a94e-c14f3dc942ab</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29989</th>\n",
       "      <td>烟花三月下扬州，到了瘦西湖才算是到了扬州，算是必选项。</td>\n",
       "      <td>803661ce-e446-32c6-a3f2-707e7d53891d</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29990</th>\n",
       "      <td>秋日的下午来到这里，游人虽然不少，但是也不算特别多，慢慢走，慢慢逛，蛮舒服的！</td>\n",
       "      <td>0c972f48-a836-3bdc-8dd5-fc4ac7d3385e</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29991</th>\n",
       "      <td>新晋5A级国家旅游风景区，渤海广场乘8路公交车终点即是（票价2元）。景区门票80元，观光车全...</td>\n",
       "      <td>927ed129-56ea-33fa-bcd3-f49527693fbb</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29992</th>\n",
       "      <td>很不错，自己爬上去的，走走停停，认识了新的小伙伴。</td>\n",
       "      <td>cda5d912-3bf9-382a-9b3a-5c08bc665360</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29993</th>\n",
       "      <td>有何功德烧香的人是人山人海</td>\n",
       "      <td>035184d7-2f13-32d2-8479-77532cdf6152</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29994</th>\n",
       "      <td>还不错，不过商业化比较严重啦，</td>\n",
       "      <td>275498bc-12b3-3086-ab78-b7d0ea0da2e9</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29995</th>\n",
       "      <td>据说这个是后来建的,没有什么历史意义,登上4楼就累的不行,在上面可以看到长江大桥</td>\n",
       "      <td>b4f946fe-0deb-3f7a-a8ae-ff1446818ec0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29996</th>\n",
       "      <td>颐和园也是我喜欢的北京经典景点之一，它长长的回廊，万寿山，七孔桥和碧波荡漾的湖水，共同构成令...</td>\n",
       "      <td>f030f2ed-1a39-32ff-a2dd-14fb757b2cb2</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29997</th>\n",
       "      <td>比较小众的景点，但是真心很美。</td>\n",
       "      <td>647597a2-ec49-3f27-8924-363ddef52ca0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29998</th>\n",
       "      <td>雨天走走，景色宜人。可惜体力有限，走了一个小时不到就回去了。没有全部领略西湖美景</td>\n",
       "      <td>1cbc2b1b-c7a6-34d6-93ae-a7da9cd93d37</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29999</th>\n",
       "      <td>160607-08 两日 &lt;br /&gt;n路线:&lt;br /&gt;n07下午:天一巷-骆驼峰-辣椒峰...</td>\n",
       "      <td>b092aa49-4688-3c41-8aec-43c03186567f</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>130000 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 Discuss  \\\n",
       "0                  好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的   \n",
       "1                        新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！   \n",
       "2                    庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去   \n",
       "3      个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...   \n",
       "4                                                 迪斯尼一日游   \n",
       "5                                                     方便   \n",
       "6      看水看山都可以。感受古人的智慧结晶，秋景美丽如画，红黄绿相间！对于身体状况不佳的人来说，走平...   \n",
       "7                                                      赞   \n",
       "8                                                   唯一糟点   \n",
       "9                                                  周未周边游   \n",
       "10     景点服务不错，就是排队 太长了，好玩的项目都是人，晚上的烟火一定jrvytqlamf要看，真...   \n",
       "11                                               绍兴护城河夜游   \n",
       "12                                     感觉还不错，作为一日游不错的选择～   \n",
       "13                                            有趣hai xing   \n",
       "14                    荡气回肠，10年去的，居然没有留下来照片，必然要再去！<br />n   \n",
       "15      景色超级棒，有美丽的故事，可以乘船游览，也可以沿湖浏览，累了可以乘坐观光车！关键是没有门票！！！   \n",
       "16     南锣鼓巷是北京市中心一条老胡同，因为其地理位置靠近什刹海，成为北京休闲娱乐的好去处，特别是外...   \n",
       "17                         个人感觉就是个卖小商品的地方，还不便宜，但是晚上夜景挺好看   \n",
       "18                                                 性价比超高   \n",
       "19                            挺普通的吧，就在楼下拍了几张图片，反正也是进不去的呵   \n",
       "20                太大了，在里面走了好长时间也就看了不到五分之一。但周围交通方便，值得去看一看   \n",
       "21                                                   迪士尼   \n",
       "22                                                   亲子游   \n",
       "23                  来苏州总是要欣赏一下古典园林的。可惜对园林不太感冒。逛逛玩玩还是不错的。   \n",
       "24                       不到长城非好汉，对于爬过华山的我来说，长城太简单了，值得一去。   \n",
       "25                                               很值得去的地方   \n",
       "26                                               第一次必到景点   \n",
       "27                                                好歹也是长城   \n",
       "28     早上一大早就起床去看升国旗，很庄严，很整齐，就是像一个节目让人转不开眼睛，特别是老辈的人听说...   \n",
       "29                                 登顶是俯瞰故宫的绝佳之处，崇祯帝在此自缢！   \n",
       "...                                                  ...   \n",
       "29970  景色很美，原谅我的语言苍白，池水很清，可能是折射也可能是有藻类，整个池水透绿，空气很清新，忍...   \n",
       "29971                                      人好多，人好多，人好多呀！   \n",
       "29972  　　1.西塘管老太臭豆腐，这个是西塘最著名的小吃了，山寨冒牌遍布西塘的角角落落，而且味道也参...   \n",
       "29973  798艺术中心 来朝圣了 到处都是拍照的人，到处都是奇装异服，到处都是行为艺术，到处都是各种...   \n",
       "29974                             心诚则灵，很多拜佛的地方都是有信仰的人才去的   \n",
       "29975  十一月初来的，风很大但是不太冷，坐滑车来回，也不错，长城非常壮观，有时候爬的很累了，但是一回...   \n",
       "29976                                    总体就是一般般吧，不会特别推荐   \n",
       "29977                                      大大的雪场，绚丽多彩的冰灯   \n",
       "29978                                  这是个代表性的建筑，代表性的地点。   \n",
       "29979                     坐地铁直奔王府井，因为急找住的地方把背包放下，就没溜达王府井   \n",
       "29980                             西双版纳热带植物园挺大的，这里很多热带植物！   \n",
       "29981  从上海到西塘坐车很方便，票价也不贵。冬天的西塘虽然冷，但是放一盏小河灯，许个愿也是棒棒哒。突...   \n",
       "29982  观音菩萨的道场，从来都没有遭遇过台风的平静之地。岛还是比较大的，感觉一天游玩还是必须要体力好...   \n",
       "29983  西湖美在有秀丽的自然风光，动人的传说，湖光山色塔影相映成趣，连如织的游人也是一道风景。喜欢这...   \n",
       "29984                                             一点都不好玩   \n",
       "29985                                               值得一看   \n",
       "29986                 【点评有奖第14季】青城山风景很美，寺庙建设很特别，但景区好多没开放   \n",
       "29987                                   人山人海，太多人了，特别是跟团的   \n",
       "29988                                     预订票在机器上取，快捷方便。   \n",
       "29989                        烟花三月下扬州，到了瘦西湖才算是到了扬州，算是必选项。   \n",
       "29990            秋日的下午来到这里，游人虽然不少，但是也不算特别多，慢慢走，慢慢逛，蛮舒服的！   \n",
       "29991  新晋5A级国家旅游风景区，渤海广场乘8路公交车终点即是（票价2元）。景区门票80元，观光车全...   \n",
       "29992                          很不错，自己爬上去的，走走停停，认识了新的小伙伴。   \n",
       "29993                                      有何功德烧香的人是人山人海   \n",
       "29994                                    还不错，不过商业化比较严重啦，   \n",
       "29995           据说这个是后来建的,没有什么历史意义,登上4楼就累的不行,在上面可以看到长江大桥   \n",
       "29996  颐和园也是我喜欢的北京经典景点之一，它长长的回廊，万寿山，七孔桥和碧波荡漾的湖水，共同构成令...   \n",
       "29997                                    比较小众的景点，但是真心很美。   \n",
       "29998           雨天走走，景色宜人。可惜体力有限，走了一个小时不到就回去了。没有全部领略西湖美景   \n",
       "29999  160607-08 两日 <br />n路线:<br />n07下午:天一巷-骆驼峰-辣椒峰...   \n",
       "\n",
       "                                         Id  Score  \n",
       "0      201e8bf2-77a2-3a98-9fcf-4ce03914e712    5.0  \n",
       "1      f4d51947-eac4-3005-9d3c-2f32d6068a2d    4.0  \n",
       "2      74aa7ae4-03a4-394c-bee0-5702d3a3082a    4.0  \n",
       "3      099661c2-4360-3c49-a2fe-8c783764f7db    5.0  \n",
       "4      97ca672d-e558-3542-ba7b-ee719bba1bab    5.0  \n",
       "5      3b7f3f2e-886f-3a68-a810-2c37cfd728d3    4.0  \n",
       "6      88914409-bd13-3d47-b5a2-691177dde8fd    4.0  \n",
       "7      bf13ec92-6079-3451-ade3-88020cb0dcb5    5.0  \n",
       "8      489c3d94-9c44-3cf2-949c-1b507c374c69    5.0  \n",
       "9      285bba78-16a3-3c1d-b648-baa483883ee3    5.0  \n",
       "10     e7801d96-73d0-35c4-9e00-cc15caaa384a    5.0  \n",
       "11     973afeca-7530-3f56-b7f5-bef36d889025    4.0  \n",
       "12     cd91dc2f-2331-3c73-bc8d-da027337270d    5.0  \n",
       "13     7ce97eca-63a8-30a1-9687-6796f34606f1    5.0  \n",
       "14     25e21097-bd41-3589-b12c-62bc7b04eb6d    5.0  \n",
       "15     98e78de7-d5d3-3b30-90d4-a63a6107d532    5.0  \n",
       "16     26334fc8-a4f1-3dc3-adb6-76b99d75cdf9    5.0  \n",
       "17     7f4d6d59-f732-3125-8e7d-8bd64c891b94    3.0  \n",
       "18     61522e3c-5d2a-3088-b60d-159dbc2976ce    5.0  \n",
       "19     37e57244-8d7e-3a1d-8f0f-8b811afb4a6a    3.0  \n",
       "20     81502f08-b884-38b8-8169-7de7a0680a82    4.0  \n",
       "21     533a667c-d6ba-313d-bc29-588b992789e0    2.0  \n",
       "22     041e4056-62f2-3f25-8e3e-8f57f66cb3d8    5.0  \n",
       "23     988f2319-3292-305a-aaaf-ba18bc397e5a    4.0  \n",
       "24     c3d7dd21-79ef-3ff2-b90d-14631e9a30b4    5.0  \n",
       "25     4b12c7b9-059f-3016-a954-3849b0456ce4    5.0  \n",
       "26     5ba1fa45-4c97-3afe-9dd6-3efea9c73a94    5.0  \n",
       "27     f6d82a8c-ef72-3a0a-95ca-95aa2fbb7f7d    4.0  \n",
       "28     9363fc36-92a7-371f-8d5f-5dd71b565455    5.0  \n",
       "29     a4dc34f1-6a97-3b86-829c-466cdaa86bf2    4.0  \n",
       "...                                     ...    ...  \n",
       "29970  f0de95d2-ef2b-3361-a6df-3f4392d74e93    NaN  \n",
       "29971  d0a33ded-fed9-3d92-9c51-c6512d71a93d    NaN  \n",
       "29972  b00cd736-093a-3d1e-b750-41283e37e09b    NaN  \n",
       "29973  8be4f87a-c76b-3e65-8ecc-a25f209824da    NaN  \n",
       "29974  9079f642-7458-3c6d-9098-c4877cc81347    NaN  \n",
       "29975  f0d47b46-9391-3d0c-b068-8753e4edd583    NaN  \n",
       "29976  8e9afa16-33ae-3f51-8156-1e9e5337605c    NaN  \n",
       "29977  3a19d80a-1cdf-3616-9707-6f8a72b4513b    NaN  \n",
       "29978  73081902-03a0-3f36-b562-900d6fd81f6c    NaN  \n",
       "29979  7fed708a-e036-36cb-98a4-f8de9fbbf3db    NaN  \n",
       "29980  cdb9eed1-54fb-3454-b98d-645c3bdfca78    NaN  \n",
       "29981  0986165f-e9a5-394c-91d2-57149a4cad4b    NaN  \n",
       "29982  99aeecae-9a3b-337c-ad7b-ab82bbb499b4    NaN  \n",
       "29983  c0334558-d138-31cb-b714-0b6a320fa903    NaN  \n",
       "29984  03f30d5a-9478-3701-b8e0-dec924ffe9a5    NaN  \n",
       "29985  a8c15edb-40a9-3014-a021-83d0d747f2d9    NaN  \n",
       "29986  856aefdd-dc4c-36b2-a102-d9333b44186c    NaN  \n",
       "29987  179cf47c-4eb3-3c94-b097-52ab1844d67b    NaN  \n",
       "29988  c5e81b73-0412-3120-a94e-c14f3dc942ab    NaN  \n",
       "29989  803661ce-e446-32c6-a3f2-707e7d53891d    NaN  \n",
       "29990  0c972f48-a836-3bdc-8dd5-fc4ac7d3385e    NaN  \n",
       "29991  927ed129-56ea-33fa-bcd3-f49527693fbb    NaN  \n",
       "29992  cda5d912-3bf9-382a-9b3a-5c08bc665360    NaN  \n",
       "29993  035184d7-2f13-32d2-8479-77532cdf6152    NaN  \n",
       "29994  275498bc-12b3-3086-ab78-b7d0ea0da2e9    NaN  \n",
       "29995  b4f946fe-0deb-3f7a-a8ae-ff1446818ec0    NaN  \n",
       "29996  f030f2ed-1a39-32ff-a2dd-14fb757b2cb2    NaN  \n",
       "29997  647597a2-ec49-3f27-8924-363ddef52ca0    NaN  \n",
       "29998  1cbc2b1b-c7a6-34d6-93ae-a7da9cd93d37    NaN  \n",
       "29999  b092aa49-4688-3c41-8aec-43c03186567f    NaN  \n",
       "\n",
       "[130000 rows x 3 columns]"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Segment every review with jieba and re-join the tokens with single spaces,\n",
    "# storing the result in a new 'comments' column (presumably as input for the\n",
    "# CountVectorizer / TfidfVectorizer imported at the top -- confirm in later cells).\n",
    "# Missing reviews (an empty CSV field is read as float NaN) are treated as empty\n",
    "# text, because jieba.cut expects a string and raises on NaN.\n",
    "data['comments'] = data['Discuss'].fillna('').apply(lambda text: ' '.join(jieba.cut(text)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Discuss</th>\n",
       "      <th>Id</th>\n",
       "      <th>Score</th>\n",
       "      <th>comments</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的</td>\n",
       "      <td>201e8bf2-77a2-3a98-9fcf-4ce03914e712</td>\n",
       "      <td>5.0</td>\n",
       "      <td>好大 的 一个 游乐 公园 ， 已经 去 了 2 次 ， 但 感觉 还 没有 玩够 似的 ！...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！</td>\n",
       "      <td>f4d51947-eac4-3005-9d3c-2f32d6068a2d</td>\n",
       "      <td>4.0</td>\n",
       "      <td>新 中国 成立 也 是 在 这 举行 ， 对 我们 中国 人 来说 有些 重要 及 深刻 的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去</td>\n",
       "      <td>74aa7ae4-03a4-394c-bee0-5702d3a3082a</td>\n",
       "      <td>4.0</td>\n",
       "      <td>庐山 瀑布 非常 有名 ， 也 有 非常 多个 瀑布 ， 只是 最 好看 的 非 三叠 泉莫...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...</td>\n",
       "      <td>099661c2-4360-3c49-a2fe-8c783764f7db</td>\n",
       "      <td>5.0</td>\n",
       "      <td>个人 觉得 颐和园 是 北京 最值 的 一起 的 地方 ， 不过 相比 下 门票 也 是 最...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>迪斯尼一日游</td>\n",
       "      <td>97ca672d-e558-3542-ba7b-ee719bba1bab</td>\n",
       "      <td>5.0</td>\n",
       "      <td>迪斯尼 一日游</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>方便</td>\n",
       "      <td>3b7f3f2e-886f-3a68-a810-2c37cfd728d3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>方便</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>看水看山都可以。感受古人的智慧结晶，秋景美丽如画，红黄绿相间！对于身体状况不佳的人来说，走平...</td>\n",
       "      <td>88914409-bd13-3d47-b5a2-691177dde8fd</td>\n",
       "      <td>4.0</td>\n",
       "      <td>看水 看山 都 可以 。 感受 古人 的 智慧结晶 ， 秋景 美丽 如画 ， 红黄绿 相间 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>赞</td>\n",
       "      <td>bf13ec92-6079-3451-ade3-88020cb0dcb5</td>\n",
       "      <td>5.0</td>\n",
       "      <td>赞</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>唯一糟点</td>\n",
       "      <td>489c3d94-9c44-3cf2-949c-1b507c374c69</td>\n",
       "      <td>5.0</td>\n",
       "      <td>唯一 糟点</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>周未周边游</td>\n",
       "      <td>285bba78-16a3-3c1d-b648-baa483883ee3</td>\n",
       "      <td>5.0</td>\n",
       "      <td>周未 周边游</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>景点服务不错，就是排队 太长了，好玩的项目都是人，晚上的烟火一定jrvytqlamf要看，真...</td>\n",
       "      <td>e7801d96-73d0-35c4-9e00-cc15caaa384a</td>\n",
       "      <td>5.0</td>\n",
       "      <td>景点 服务 不错 ， 就是 排队   太长 了 ， 好玩 的 项目 都 是 人 ， 晚上 的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>绍兴护城河夜游</td>\n",
       "      <td>973afeca-7530-3f56-b7f5-bef36d889025</td>\n",
       "      <td>4.0</td>\n",
       "      <td>绍兴 护城河 夜游</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>感觉还不错，作为一日游不错的选择～</td>\n",
       "      <td>cd91dc2f-2331-3c73-bc8d-da027337270d</td>\n",
       "      <td>5.0</td>\n",
       "      <td>感觉 还 不错 ， 作为 一日游 不错 的 选择 ～</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>有趣hai xing</td>\n",
       "      <td>7ce97eca-63a8-30a1-9687-6796f34606f1</td>\n",
       "      <td>5.0</td>\n",
       "      <td>有趣 hai   xing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>荡气回肠，10年去的，居然没有留下来照片，必然要再去！&lt;br /&gt;n</td>\n",
       "      <td>25e21097-bd41-3589-b12c-62bc7b04eb6d</td>\n",
       "      <td>5.0</td>\n",
       "      <td>荡气回肠 ， 10 年 去 的 ， 居然 没有 留下来 照片 ， 必然 要 再 去 ！ &lt; ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>景色超级棒，有美丽的故事，可以乘船游览，也可以沿湖浏览，累了可以乘坐观光车！关键是没有门票！！！</td>\n",
       "      <td>98e78de7-d5d3-3b30-90d4-a63a6107d532</td>\n",
       "      <td>5.0</td>\n",
       "      <td>景色 超级 棒 ， 有 美丽 的 故事 ， 可以 乘船 游览 ， 也 可以 沿湖 浏览 ， ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>南锣鼓巷是北京市中心一条老胡同，因为其地理位置靠近什刹海，成为北京休闲娱乐的好去处，特别是外...</td>\n",
       "      <td>26334fc8-a4f1-3dc3-adb6-76b99d75cdf9</td>\n",
       "      <td>5.0</td>\n",
       "      <td>南锣鼓巷 是 北京市 中心 一条 老 胡同 ， 因为 其 地理位置 靠近 什刹海 ， 成为 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>个人感觉就是个卖小商品的地方，还不便宜，但是晚上夜景挺好看</td>\n",
       "      <td>7f4d6d59-f732-3125-8e7d-8bd64c891b94</td>\n",
       "      <td>3.0</td>\n",
       "      <td>个人感觉 就是 个 卖 小商品 的 地方 ， 还 不 便宜 ， 但是 晚上 夜景 挺 好看</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>性价比超高</td>\n",
       "      <td>61522e3c-5d2a-3088-b60d-159dbc2976ce</td>\n",
       "      <td>5.0</td>\n",
       "      <td>性价比 超高</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>挺普通的吧，就在楼下拍了几张图片，反正也是进不去的呵</td>\n",
       "      <td>37e57244-8d7e-3a1d-8f0f-8b811afb4a6a</td>\n",
       "      <td>3.0</td>\n",
       "      <td>挺 普通 的 吧 ， 就 在 楼下 拍 了 几张 图片 ， 反正 也 是 进不去 的 呵</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>太大了，在里面走了好长时间也就看了不到五分之一。但周围交通方便，值得去看一看</td>\n",
       "      <td>81502f08-b884-38b8-8169-7de7a0680a82</td>\n",
       "      <td>4.0</td>\n",
       "      <td>太大 了 ， 在 里面 走 了 好长时间 也 就 看 了 不到 五分之一 。 但 周围 交通...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>迪士尼</td>\n",
       "      <td>533a667c-d6ba-313d-bc29-588b992789e0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>迪士尼</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>亲子游</td>\n",
       "      <td>041e4056-62f2-3f25-8e3e-8f57f66cb3d8</td>\n",
       "      <td>5.0</td>\n",
       "      <td>亲子 游</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>来苏州总是要欣赏一下古典园林的。可惜对园林不太感冒。逛逛玩玩还是不错的。</td>\n",
       "      <td>988f2319-3292-305a-aaaf-ba18bc397e5a</td>\n",
       "      <td>4.0</td>\n",
       "      <td>来 苏州 总是 要 欣赏 一下 古典 园林 的 。 可惜 对 园林 不太 感冒 。 逛逛 玩...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>不到长城非好汉，对于爬过华山的我来说，长城太简单了，值得一去。</td>\n",
       "      <td>c3d7dd21-79ef-3ff2-b90d-14631e9a30b4</td>\n",
       "      <td>5.0</td>\n",
       "      <td>不到长城非好汉 ， 对于 爬 过 华山 的 我 来说 ， 长城 太 简单 了 ， 值得 一去 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>很值得去的地方</td>\n",
       "      <td>4b12c7b9-059f-3016-a954-3849b0456ce4</td>\n",
       "      <td>5.0</td>\n",
       "      <td>很 值得 去 的 地方</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>第一次必到景点</td>\n",
       "      <td>5ba1fa45-4c97-3afe-9dd6-3efea9c73a94</td>\n",
       "      <td>5.0</td>\n",
       "      <td>第一次 必到 景点</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>好歹也是长城</td>\n",
       "      <td>f6d82a8c-ef72-3a0a-95ca-95aa2fbb7f7d</td>\n",
       "      <td>4.0</td>\n",
       "      <td>好歹 也 是 长城</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>早上一大早就起床去看升国旗，很庄严，很整齐，就是像一个节目让人转不开眼睛，特别是老辈的人听说...</td>\n",
       "      <td>9363fc36-92a7-371f-8d5f-5dd71b565455</td>\n",
       "      <td>5.0</td>\n",
       "      <td>早上 一大早 就 起床 去 看 升国旗 ， 很 庄严 ， 很 整齐 ， 就是 像 一个 节目...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>登顶是俯瞰故宫的绝佳之处，崇祯帝在此自缢！</td>\n",
       "      <td>a4dc34f1-6a97-3b86-829c-466cdaa86bf2</td>\n",
       "      <td>4.0</td>\n",
       "      <td>登顶 是 俯瞰 故宫 的 绝佳 之 处 ， 崇祯帝 在 此 自缢 ！</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29970</th>\n",
       "      <td>景色很美，原谅我的语言苍白，池水很清，可能是折射也可能是有藻类，整个池水透绿，空气很清新，忍...</td>\n",
       "      <td>f0de95d2-ef2b-3361-a6df-3f4392d74e93</td>\n",
       "      <td>NaN</td>\n",
       "      <td>景色 很 美 ， 原谅 我 的 语言 苍白 ， 池水 很清 ， 可能 是 折射 也 可能 是...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29971</th>\n",
       "      <td>人好多，人好多，人好多呀！</td>\n",
       "      <td>d0a33ded-fed9-3d92-9c51-c6512d71a93d</td>\n",
       "      <td>NaN</td>\n",
       "      <td>人 好多 ， 人 好多 ， 人 好多 呀 ！</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29972</th>\n",
       "      <td>1.西塘管老太臭豆腐，这个是西塘最著名的小吃了，山寨冒牌遍布西塘的角角落落，而且味道也参...</td>\n",
       "      <td>b00cd736-093a-3d1e-b750-41283e37e09b</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1 . 西塘 管 老太 臭豆腐 ， 这个 是 西塘 最 著名 的 小吃 了 ， 山寨...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29973</th>\n",
       "      <td>798艺术中心 来朝圣了 到处都是拍照的人，到处都是奇装异服，到处都是行为艺术，到处都是各种...</td>\n",
       "      <td>8be4f87a-c76b-3e65-8ecc-a25f209824da</td>\n",
       "      <td>NaN</td>\n",
       "      <td>798 艺术 中心   来 朝圣 了   到处 都 是 拍照 的 人 ， 到处 都 是 奇装...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29974</th>\n",
       "      <td>心诚则灵，很多拜佛的地方都是有信仰的人才去的</td>\n",
       "      <td>9079f642-7458-3c6d-9098-c4877cc81347</td>\n",
       "      <td>NaN</td>\n",
       "      <td>心诚则灵 ， 很多 拜佛 的 地方 都 是 有 信仰 的 人才 去 的</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29975</th>\n",
       "      <td>十一月初来的，风很大但是不太冷，坐滑车来回，也不错，长城非常壮观，有时候爬的很累了，但是一回...</td>\n",
       "      <td>f0d47b46-9391-3d0c-b068-8753e4edd583</td>\n",
       "      <td>NaN</td>\n",
       "      <td>十一月 初来 的 ， 风 很大 但是 不 太冷 ， 坐 滑车 来回 ， 也 不错 ， 长城 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29976</th>\n",
       "      <td>总体就是一般般吧，不会特别推荐</td>\n",
       "      <td>8e9afa16-33ae-3f51-8156-1e9e5337605c</td>\n",
       "      <td>NaN</td>\n",
       "      <td>总体 就是 一般般 吧 ， 不会 特别 推荐</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29977</th>\n",
       "      <td>大大的雪场，绚丽多彩的冰灯</td>\n",
       "      <td>3a19d80a-1cdf-3616-9707-6f8a72b4513b</td>\n",
       "      <td>NaN</td>\n",
       "      <td>大大的 雪场 ， 绚丽多彩 的 冰灯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29978</th>\n",
       "      <td>这是个代表性的建筑，代表性的地点。</td>\n",
       "      <td>73081902-03a0-3f36-b562-900d6fd81f6c</td>\n",
       "      <td>NaN</td>\n",
       "      <td>这 是 个 代表性 的 建筑 ， 代表性 的 地点 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29979</th>\n",
       "      <td>坐地铁直奔王府井，因为急找住的地方把背包放下，就没溜达王府井</td>\n",
       "      <td>7fed708a-e036-36cb-98a4-f8de9fbbf3db</td>\n",
       "      <td>NaN</td>\n",
       "      <td>坐地铁 直奔 王府井 ， 因为 急 找 住 的 地方 把 背包 放下 ， 就 没 溜达 王府井</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29980</th>\n",
       "      <td>西双版纳热带植物园挺大的，这里很多热带植物！</td>\n",
       "      <td>cdb9eed1-54fb-3454-b98d-645c3bdfca78</td>\n",
       "      <td>NaN</td>\n",
       "      <td>西双版纳 热带 植物园 挺大 的 ， 这里 很多 热带植物 ！</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29981</th>\n",
       "      <td>从上海到西塘坐车很方便，票价也不贵。冬天的西塘虽然冷，但是放一盏小河灯，许个愿也是棒棒哒。突...</td>\n",
       "      <td>0986165f-e9a5-394c-91d2-57149a4cad4b</td>\n",
       "      <td>NaN</td>\n",
       "      <td>从 上海 到 西塘 坐车 很 方便 ， 票价 也 不贵 。 冬天 的 西塘 虽然 冷 ， 但...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29982</th>\n",
       "      <td>观音菩萨的道场，从来都没有遭遇过台风的平静之地。岛还是比较大的，感觉一天游玩还是必须要体力好...</td>\n",
       "      <td>99aeecae-9a3b-337c-ad7b-ab82bbb499b4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>观音菩萨 的 道场 ， 从来 都 没有 遭遇 过 台风 的 平静 之地 。 岛 还是 比较 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29983</th>\n",
       "      <td>西湖美在有秀丽的自然风光，动人的传说，湖光山色塔影相映成趣，连如织的游人也是一道风景。喜欢这...</td>\n",
       "      <td>c0334558-d138-31cb-b714-0b6a320fa903</td>\n",
       "      <td>NaN</td>\n",
       "      <td>西湖 美在 有 秀丽 的 自然风光 ， 动人 的 传说 ， 湖光山色 塔影 相映成趣 ， 连...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29984</th>\n",
       "      <td>一点都不好玩</td>\n",
       "      <td>03f30d5a-9478-3701-b8e0-dec924ffe9a5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>一点 都 不好玩</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29985</th>\n",
       "      <td>值得一看</td>\n",
       "      <td>a8c15edb-40a9-3014-a021-83d0d747f2d9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>值得一看</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29986</th>\n",
       "      <td>【点评有奖第14季】青城山风景很美，寺庙建设很特别，但景区好多没开放</td>\n",
       "      <td>856aefdd-dc4c-36b2-a102-d9333b44186c</td>\n",
       "      <td>NaN</td>\n",
       "      <td>【 点评 有奖 第 14 季 】 青城山 风景 很 美 ， 寺庙 建设 很 特别 ， 但 景...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29987</th>\n",
       "      <td>人山人海，太多人了，特别是跟团的</td>\n",
       "      <td>179cf47c-4eb3-3c94-b097-52ab1844d67b</td>\n",
       "      <td>NaN</td>\n",
       "      <td>人山人海 ， 太多人 了 ， 特别 是 跟 团 的</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29988</th>\n",
       "      <td>预订票在机器上取，快捷方便。</td>\n",
       "      <td>c5e81b73-0412-3120-a94e-c14f3dc942ab</td>\n",
       "      <td>NaN</td>\n",
       "      <td>预订 票 在 机器 上取 ， 快捷 方便 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29989</th>\n",
       "      <td>烟花三月下扬州，到了瘦西湖才算是到了扬州，算是必选项。</td>\n",
       "      <td>803661ce-e446-32c6-a3f2-707e7d53891d</td>\n",
       "      <td>NaN</td>\n",
       "      <td>烟花 三月 下 扬州 ， 到 了 瘦西湖 才 算是 到 了 扬州 ， 算是 必 选项 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29990</th>\n",
       "      <td>秋日的下午来到这里，游人虽然不少，但是也不算特别多，慢慢走，慢慢逛，蛮舒服的！</td>\n",
       "      <td>0c972f48-a836-3bdc-8dd5-fc4ac7d3385e</td>\n",
       "      <td>NaN</td>\n",
       "      <td>秋日 的 下午 来到 这里 ， 游人 虽然 不少 ， 但是 也 不算 特别 多 ， 慢慢 走...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29991</th>\n",
       "      <td>新晋5A级国家旅游风景区，渤海广场乘8路公交车终点即是（票价2元）。景区门票80元，观光车全...</td>\n",
       "      <td>927ed129-56ea-33fa-bcd3-f49527693fbb</td>\n",
       "      <td>NaN</td>\n",
       "      <td>新晋 5A 级 国家 旅游 风景区 ， 渤海 广场 乘 8 路 公交车 终点 即 是 （ 票...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29992</th>\n",
       "      <td>很不错，自己爬上去的，走走停停，认识了新的小伙伴。</td>\n",
       "      <td>cda5d912-3bf9-382a-9b3a-5c08bc665360</td>\n",
       "      <td>NaN</td>\n",
       "      <td>很 不错 ， 自己 爬上去 的 ， 走走停停 ， 认识 了 新 的 小伙伴 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29993</th>\n",
       "      <td>有何功德烧香的人是人山人海</td>\n",
       "      <td>035184d7-2f13-32d2-8479-77532cdf6152</td>\n",
       "      <td>NaN</td>\n",
       "      <td>有何 功德 烧香 的 人 是 人山人海</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29994</th>\n",
       "      <td>还不错，不过商业化比较严重啦，</td>\n",
       "      <td>275498bc-12b3-3086-ab78-b7d0ea0da2e9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>还 不错 ， 不过 商业化 比较严重 啦 ，</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29995</th>\n",
       "      <td>据说这个是后来建的,没有什么历史意义,登上4楼就累的不行,在上面可以看到长江大桥</td>\n",
       "      <td>b4f946fe-0deb-3f7a-a8ae-ff1446818ec0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>据说 这个 是 后来 建 的 , 没有 什么 历史 意义 , 登上 4 楼 就 累 的 不行...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29996</th>\n",
       "      <td>颐和园也是我喜欢的北京经典景点之一，它长长的回廊，万寿山，七孔桥和碧波荡漾的湖水，共同构成令...</td>\n",
       "      <td>f030f2ed-1a39-32ff-a2dd-14fb757b2cb2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>颐和园 也 是 我 喜欢 的 北京 经典 景点 之一 ， 它 长长的 回廊 ， 万寿山 ， ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29997</th>\n",
       "      <td>比较小众的景点，但是真心很美。</td>\n",
       "      <td>647597a2-ec49-3f27-8924-363ddef52ca0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>比较 小众 的 景点 ， 但是 真心 很 美 。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29998</th>\n",
       "      <td>雨天走走，景色宜人。可惜体力有限，走了一个小时不到就回去了。没有全部领略西湖美景</td>\n",
       "      <td>1cbc2b1b-c7a6-34d6-93ae-a7da9cd93d37</td>\n",
       "      <td>NaN</td>\n",
       "      <td>雨天 走走 ， 景色宜人 。 可惜 体力 有限 ， 走 了 一个 小时 不到 就 回去 了 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29999</th>\n",
       "      <td>160607-08 两日 &lt;br /&gt;n路线:&lt;br /&gt;n07下午:天一巷-骆驼峰-辣椒峰...</td>\n",
       "      <td>b092aa49-4688-3c41-8aec-43c03186567f</td>\n",
       "      <td>NaN</td>\n",
       "      <td>160607 - 08   两日   &lt; br   / &gt; n 路线 : &lt; br   / ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>130000 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 Discuss  \\\n",
       "0                  好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的   \n",
       "1                        新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！   \n",
       "2                    庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去   \n",
       "3      个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...   \n",
       "4                                                 迪斯尼一日游   \n",
       "5                                                     方便   \n",
       "6      看水看山都可以。感受古人的智慧结晶，秋景美丽如画，红黄绿相间！对于身体状况不佳的人来说，走平...   \n",
       "7                                                      赞   \n",
       "8                                                   唯一糟点   \n",
       "9                                                  周未周边游   \n",
       "10     景点服务不错，就是排队 太长了，好玩的项目都是人，晚上的烟火一定jrvytqlamf要看，真...   \n",
       "11                                               绍兴护城河夜游   \n",
       "12                                     感觉还不错，作为一日游不错的选择～   \n",
       "13                                            有趣hai xing   \n",
       "14                    荡气回肠，10年去的，居然没有留下来照片，必然要再去！<br />n   \n",
       "15      景色超级棒，有美丽的故事，可以乘船游览，也可以沿湖浏览，累了可以乘坐观光车！关键是没有门票！！！   \n",
       "16     南锣鼓巷是北京市中心一条老胡同，因为其地理位置靠近什刹海，成为北京休闲娱乐的好去处，特别是外...   \n",
       "17                         个人感觉就是个卖小商品的地方，还不便宜，但是晚上夜景挺好看   \n",
       "18                                                 性价比超高   \n",
       "19                            挺普通的吧，就在楼下拍了几张图片，反正也是进不去的呵   \n",
       "20                太大了，在里面走了好长时间也就看了不到五分之一。但周围交通方便，值得去看一看   \n",
       "21                                                   迪士尼   \n",
       "22                                                   亲子游   \n",
       "23                  来苏州总是要欣赏一下古典园林的。可惜对园林不太感冒。逛逛玩玩还是不错的。   \n",
       "24                       不到长城非好汉，对于爬过华山的我来说，长城太简单了，值得一去。   \n",
       "25                                               很值得去的地方   \n",
       "26                                               第一次必到景点   \n",
       "27                                                好歹也是长城   \n",
       "28     早上一大早就起床去看升国旗，很庄严，很整齐，就是像一个节目让人转不开眼睛，特别是老辈的人听说...   \n",
       "29                                 登顶是俯瞰故宫的绝佳之处，崇祯帝在此自缢！   \n",
       "...                                                  ...   \n",
       "29970  景色很美，原谅我的语言苍白，池水很清，可能是折射也可能是有藻类，整个池水透绿，空气很清新，忍...   \n",
       "29971                                      人好多，人好多，人好多呀！   \n",
       "29972  　　1.西塘管老太臭豆腐，这个是西塘最著名的小吃了，山寨冒牌遍布西塘的角角落落，而且味道也参...   \n",
       "29973  798艺术中心 来朝圣了 到处都是拍照的人，到处都是奇装异服，到处都是行为艺术，到处都是各种...   \n",
       "29974                             心诚则灵，很多拜佛的地方都是有信仰的人才去的   \n",
       "29975  十一月初来的，风很大但是不太冷，坐滑车来回，也不错，长城非常壮观，有时候爬的很累了，但是一回...   \n",
       "29976                                    总体就是一般般吧，不会特别推荐   \n",
       "29977                                      大大的雪场，绚丽多彩的冰灯   \n",
       "29978                                  这是个代表性的建筑，代表性的地点。   \n",
       "29979                     坐地铁直奔王府井，因为急找住的地方把背包放下，就没溜达王府井   \n",
       "29980                             西双版纳热带植物园挺大的，这里很多热带植物！   \n",
       "29981  从上海到西塘坐车很方便，票价也不贵。冬天的西塘虽然冷，但是放一盏小河灯，许个愿也是棒棒哒。突...   \n",
       "29982  观音菩萨的道场，从来都没有遭遇过台风的平静之地。岛还是比较大的，感觉一天游玩还是必须要体力好...   \n",
       "29983  西湖美在有秀丽的自然风光，动人的传说，湖光山色塔影相映成趣，连如织的游人也是一道风景。喜欢这...   \n",
       "29984                                             一点都不好玩   \n",
       "29985                                               值得一看   \n",
       "29986                 【点评有奖第14季】青城山风景很美，寺庙建设很特别，但景区好多没开放   \n",
       "29987                                   人山人海，太多人了，特别是跟团的   \n",
       "29988                                     预订票在机器上取，快捷方便。   \n",
       "29989                        烟花三月下扬州，到了瘦西湖才算是到了扬州，算是必选项。   \n",
       "29990            秋日的下午来到这里，游人虽然不少，但是也不算特别多，慢慢走，慢慢逛，蛮舒服的！   \n",
       "29991  新晋5A级国家旅游风景区，渤海广场乘8路公交车终点即是（票价2元）。景区门票80元，观光车全...   \n",
       "29992                          很不错，自己爬上去的，走走停停，认识了新的小伙伴。   \n",
       "29993                                      有何功德烧香的人是人山人海   \n",
       "29994                                    还不错，不过商业化比较严重啦，   \n",
       "29995           据说这个是后来建的,没有什么历史意义,登上4楼就累的不行,在上面可以看到长江大桥   \n",
       "29996  颐和园也是我喜欢的北京经典景点之一，它长长的回廊，万寿山，七孔桥和碧波荡漾的湖水，共同构成令...   \n",
       "29997                                    比较小众的景点，但是真心很美。   \n",
       "29998           雨天走走，景色宜人。可惜体力有限，走了一个小时不到就回去了。没有全部领略西湖美景   \n",
       "29999  160607-08 两日 <br />n路线:<br />n07下午:天一巷-骆驼峰-辣椒峰...   \n",
       "\n",
       "                                         Id  Score  \\\n",
       "0      201e8bf2-77a2-3a98-9fcf-4ce03914e712    5.0   \n",
       "1      f4d51947-eac4-3005-9d3c-2f32d6068a2d    4.0   \n",
       "2      74aa7ae4-03a4-394c-bee0-5702d3a3082a    4.0   \n",
       "3      099661c2-4360-3c49-a2fe-8c783764f7db    5.0   \n",
       "4      97ca672d-e558-3542-ba7b-ee719bba1bab    5.0   \n",
       "5      3b7f3f2e-886f-3a68-a810-2c37cfd728d3    4.0   \n",
       "6      88914409-bd13-3d47-b5a2-691177dde8fd    4.0   \n",
       "7      bf13ec92-6079-3451-ade3-88020cb0dcb5    5.0   \n",
       "8      489c3d94-9c44-3cf2-949c-1b507c374c69    5.0   \n",
       "9      285bba78-16a3-3c1d-b648-baa483883ee3    5.0   \n",
       "10     e7801d96-73d0-35c4-9e00-cc15caaa384a    5.0   \n",
       "11     973afeca-7530-3f56-b7f5-bef36d889025    4.0   \n",
       "12     cd91dc2f-2331-3c73-bc8d-da027337270d    5.0   \n",
       "13     7ce97eca-63a8-30a1-9687-6796f34606f1    5.0   \n",
       "14     25e21097-bd41-3589-b12c-62bc7b04eb6d    5.0   \n",
       "15     98e78de7-d5d3-3b30-90d4-a63a6107d532    5.0   \n",
       "16     26334fc8-a4f1-3dc3-adb6-76b99d75cdf9    5.0   \n",
       "17     7f4d6d59-f732-3125-8e7d-8bd64c891b94    3.0   \n",
       "18     61522e3c-5d2a-3088-b60d-159dbc2976ce    5.0   \n",
       "19     37e57244-8d7e-3a1d-8f0f-8b811afb4a6a    3.0   \n",
       "20     81502f08-b884-38b8-8169-7de7a0680a82    4.0   \n",
       "21     533a667c-d6ba-313d-bc29-588b992789e0    2.0   \n",
       "22     041e4056-62f2-3f25-8e3e-8f57f66cb3d8    5.0   \n",
       "23     988f2319-3292-305a-aaaf-ba18bc397e5a    4.0   \n",
       "24     c3d7dd21-79ef-3ff2-b90d-14631e9a30b4    5.0   \n",
       "25     4b12c7b9-059f-3016-a954-3849b0456ce4    5.0   \n",
       "26     5ba1fa45-4c97-3afe-9dd6-3efea9c73a94    5.0   \n",
       "27     f6d82a8c-ef72-3a0a-95ca-95aa2fbb7f7d    4.0   \n",
       "28     9363fc36-92a7-371f-8d5f-5dd71b565455    5.0   \n",
       "29     a4dc34f1-6a97-3b86-829c-466cdaa86bf2    4.0   \n",
       "...                                     ...    ...   \n",
       "29970  f0de95d2-ef2b-3361-a6df-3f4392d74e93    NaN   \n",
       "29971  d0a33ded-fed9-3d92-9c51-c6512d71a93d    NaN   \n",
       "29972  b00cd736-093a-3d1e-b750-41283e37e09b    NaN   \n",
       "29973  8be4f87a-c76b-3e65-8ecc-a25f209824da    NaN   \n",
       "29974  9079f642-7458-3c6d-9098-c4877cc81347    NaN   \n",
       "29975  f0d47b46-9391-3d0c-b068-8753e4edd583    NaN   \n",
       "29976  8e9afa16-33ae-3f51-8156-1e9e5337605c    NaN   \n",
       "29977  3a19d80a-1cdf-3616-9707-6f8a72b4513b    NaN   \n",
       "29978  73081902-03a0-3f36-b562-900d6fd81f6c    NaN   \n",
       "29979  7fed708a-e036-36cb-98a4-f8de9fbbf3db    NaN   \n",
       "29980  cdb9eed1-54fb-3454-b98d-645c3bdfca78    NaN   \n",
       "29981  0986165f-e9a5-394c-91d2-57149a4cad4b    NaN   \n",
       "29982  99aeecae-9a3b-337c-ad7b-ab82bbb499b4    NaN   \n",
       "29983  c0334558-d138-31cb-b714-0b6a320fa903    NaN   \n",
       "29984  03f30d5a-9478-3701-b8e0-dec924ffe9a5    NaN   \n",
       "29985  a8c15edb-40a9-3014-a021-83d0d747f2d9    NaN   \n",
       "29986  856aefdd-dc4c-36b2-a102-d9333b44186c    NaN   \n",
       "29987  179cf47c-4eb3-3c94-b097-52ab1844d67b    NaN   \n",
       "29988  c5e81b73-0412-3120-a94e-c14f3dc942ab    NaN   \n",
       "29989  803661ce-e446-32c6-a3f2-707e7d53891d    NaN   \n",
       "29990  0c972f48-a836-3bdc-8dd5-fc4ac7d3385e    NaN   \n",
       "29991  927ed129-56ea-33fa-bcd3-f49527693fbb    NaN   \n",
       "29992  cda5d912-3bf9-382a-9b3a-5c08bc665360    NaN   \n",
       "29993  035184d7-2f13-32d2-8479-77532cdf6152    NaN   \n",
       "29994  275498bc-12b3-3086-ab78-b7d0ea0da2e9    NaN   \n",
       "29995  b4f946fe-0deb-3f7a-a8ae-ff1446818ec0    NaN   \n",
       "29996  f030f2ed-1a39-32ff-a2dd-14fb757b2cb2    NaN   \n",
       "29997  647597a2-ec49-3f27-8924-363ddef52ca0    NaN   \n",
       "29998  1cbc2b1b-c7a6-34d6-93ae-a7da9cd93d37    NaN   \n",
       "29999  b092aa49-4688-3c41-8aec-43c03186567f    NaN   \n",
       "\n",
       "                                                comments  \n",
       "0      好大 的 一个 游乐 公园 ， 已经 去 了 2 次 ， 但 感觉 还 没有 玩够 似的 ！...  \n",
       "1      新 中国 成立 也 是 在 这 举行 ， 对 我们 中国 人 来说 有些 重要 及 深刻 的...  \n",
       "2      庐山 瀑布 非常 有名 ， 也 有 非常 多个 瀑布 ， 只是 最 好看 的 非 三叠 泉莫...  \n",
       "3      个人 觉得 颐和园 是 北京 最值 的 一起 的 地方 ， 不过 相比 下 门票 也 是 最...  \n",
       "4                                                迪斯尼 一日游  \n",
       "5                                                     方便  \n",
       "6      看水 看山 都 可以 。 感受 古人 的 智慧结晶 ， 秋景 美丽 如画 ， 红黄绿 相间 ...  \n",
       "7                                                      赞  \n",
       "8                                                  唯一 糟点  \n",
       "9                                                 周未 周边游  \n",
       "10     景点 服务 不错 ， 就是 排队   太长 了 ， 好玩 的 项目 都 是 人 ， 晚上 的...  \n",
       "11                                             绍兴 护城河 夜游  \n",
       "12                            感觉 还 不错 ， 作为 一日游 不错 的 选择 ～  \n",
       "13                                         有趣 hai   xing  \n",
       "14     荡气回肠 ， 10 年 去 的 ， 居然 没有 留下来 照片 ， 必然 要 再 去 ！ < ...  \n",
       "15     景色 超级 棒 ， 有 美丽 的 故事 ， 可以 乘船 游览 ， 也 可以 沿湖 浏览 ， ...  \n",
       "16     南锣鼓巷 是 北京市 中心 一条 老 胡同 ， 因为 其 地理位置 靠近 什刹海 ， 成为 ...  \n",
       "17         个人感觉 就是 个 卖 小商品 的 地方 ， 还 不 便宜 ， 但是 晚上 夜景 挺 好看  \n",
       "18                                                性价比 超高  \n",
       "19          挺 普通 的 吧 ， 就 在 楼下 拍 了 几张 图片 ， 反正 也 是 进不去 的 呵  \n",
       "20     太大 了 ， 在 里面 走 了 好长时间 也 就 看 了 不到 五分之一 。 但 周围 交通...  \n",
       "21                                                   迪士尼  \n",
       "22                                                  亲子 游  \n",
       "23     来 苏州 总是 要 欣赏 一下 古典 园林 的 。 可惜 对 园林 不太 感冒 。 逛逛 玩...  \n",
       "24      不到长城非好汉 ， 对于 爬 过 华山 的 我 来说 ， 长城 太 简单 了 ， 值得 一去 。  \n",
       "25                                           很 值得 去 的 地方  \n",
       "26                                             第一次 必到 景点  \n",
       "27                                             好歹 也 是 长城  \n",
       "28     早上 一大早 就 起床 去 看 升国旗 ， 很 庄严 ， 很 整齐 ， 就是 像 一个 节目...  \n",
       "29                    登顶 是 俯瞰 故宫 的 绝佳 之 处 ， 崇祯帝 在 此 自缢 ！  \n",
       "...                                                  ...  \n",
       "29970  景色 很 美 ， 原谅 我 的 语言 苍白 ， 池水 很清 ， 可能 是 折射 也 可能 是...  \n",
       "29971                             人 好多 ， 人 好多 ， 人 好多 呀 ！  \n",
       "29972  　 　 1 . 西塘 管 老太 臭豆腐 ， 这个 是 西塘 最 著名 的 小吃 了 ， 山寨...  \n",
       "29973  798 艺术 中心   来 朝圣 了   到处 都 是 拍照 的 人 ， 到处 都 是 奇装...  \n",
       "29974                心诚则灵 ， 很多 拜佛 的 地方 都 是 有 信仰 的 人才 去 的  \n",
       "29975  十一月 初来 的 ， 风 很大 但是 不 太冷 ， 坐 滑车 来回 ， 也 不错 ， 长城 ...  \n",
       "29976                             总体 就是 一般般 吧 ， 不会 特别 推荐  \n",
       "29977                                 大大的 雪场 ， 绚丽多彩 的 冰灯  \n",
       "29978                        这 是 个 代表性 的 建筑 ， 代表性 的 地点 。  \n",
       "29979    坐地铁 直奔 王府井 ， 因为 急 找 住 的 地方 把 背包 放下 ， 就 没 溜达 王府井  \n",
       "29980                    西双版纳 热带 植物园 挺大 的 ， 这里 很多 热带植物 ！  \n",
       "29981  从 上海 到 西塘 坐车 很 方便 ， 票价 也 不贵 。 冬天 的 西塘 虽然 冷 ， 但...  \n",
       "29982  观音菩萨 的 道场 ， 从来 都 没有 遭遇 过 台风 的 平静 之地 。 岛 还是 比较 ...  \n",
       "29983  西湖 美在 有 秀丽 的 自然风光 ， 动人 的 传说 ， 湖光山色 塔影 相映成趣 ， 连...  \n",
       "29984                                           一点 都 不好玩  \n",
       "29985                                               值得一看  \n",
       "29986  【 点评 有奖 第 14 季 】 青城山 风景 很 美 ， 寺庙 建设 很 特别 ， 但 景...  \n",
       "29987                          人山人海 ， 太多人 了 ， 特别 是 跟 团 的  \n",
       "29988                             预订 票 在 机器 上取 ， 快捷 方便 。  \n",
       "29989       烟花 三月 下 扬州 ， 到 了 瘦西湖 才 算是 到 了 扬州 ， 算是 必 选项 。  \n",
       "29990  秋日 的 下午 来到 这里 ， 游人 虽然 不少 ， 但是 也 不算 特别 多 ， 慢慢 走...  \n",
       "29991  新晋 5A 级 国家 旅游 风景区 ， 渤海 广场 乘 8 路 公交车 终点 即 是 （ 票...  \n",
       "29992            很 不错 ， 自己 爬上去 的 ， 走走停停 ， 认识 了 新 的 小伙伴 。  \n",
       "29993                                有何 功德 烧香 的 人 是 人山人海  \n",
       "29994                             还 不错 ， 不过 商业化 比较严重 啦 ，  \n",
       "29995  据说 这个 是 后来 建 的 , 没有 什么 历史 意义 , 登上 4 楼 就 累 的 不行...  \n",
       "29996  颐和园 也 是 我 喜欢 的 北京 经典 景点 之一 ， 它 长长的 回廊 ， 万寿山 ， ...  \n",
       "29997                           比较 小众 的 景点 ， 但是 真心 很 美 。  \n",
       "29998  雨天 走走 ， 景色宜人 。 可惜 体力 有限 ， 走 了 一个 小时 不到 就 回去 了 ...  \n",
       "29999  160607 - 08   两日   < br   / > n 路线 : < br   / ...  \n",
       "\n",
       "[130000 rows x 4 columns]"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0        好大 的 一个 游乐 公园 ， 已经 去 了 2 次 ， 但 感觉 还 没有 玩够 似的 ！...\n",
       "1        新 中国 成立 也 是 在 这 举行 ， 对 我们 中国 人 来说 有些 重要 及 深刻 的...\n",
       "2        庐山 瀑布 非常 有名 ， 也 有 非常 多个 瀑布 ， 只是 最 好看 的 非 三叠 泉莫...\n",
       "3        个人 觉得 颐和园 是 北京 最值 的 一起 的 地方 ， 不过 相比 下 门票 也 是 最...\n",
       "4                                                  迪斯尼 一日游\n",
       "5                                                       方便\n",
       "6        看水 看山 都 可以 。 感受 古人 的 智慧结晶 ， 秋景 美丽 如画 ， 红黄绿 相间 ...\n",
       "7                                                        赞\n",
       "8                                                    唯一 糟点\n",
       "9                                                   周未 周边游\n",
       "10       景点 服务 不错 ， 就是 排队   太长 了 ， 好玩 的 项目 都 是 人 ， 晚上 的...\n",
       "11                                               绍兴 护城河 夜游\n",
       "12                              感觉 还 不错 ， 作为 一日游 不错 的 选择 ～\n",
       "13                                           有趣 hai   xing\n",
       "14       荡气回肠 ， 10 年 去 的 ， 居然 没有 留下来 照片 ， 必然 要 再 去 ！ < ...\n",
       "15       景色 超级 棒 ， 有 美丽 的 故事 ， 可以 乘船 游览 ， 也 可以 沿湖 浏览 ， ...\n",
       "16       南锣鼓巷 是 北京市 中心 一条 老 胡同 ， 因为 其 地理位置 靠近 什刹海 ， 成为 ...\n",
       "17           个人感觉 就是 个 卖 小商品 的 地方 ， 还 不 便宜 ， 但是 晚上 夜景 挺 好看\n",
       "18                                                  性价比 超高\n",
       "19            挺 普通 的 吧 ， 就 在 楼下 拍 了 几张 图片 ， 反正 也 是 进不去 的 呵\n",
       "20       太大 了 ， 在 里面 走 了 好长时间 也 就 看 了 不到 五分之一 。 但 周围 交通...\n",
       "21                                                     迪士尼\n",
       "22                                                    亲子 游\n",
       "23       来 苏州 总是 要 欣赏 一下 古典 园林 的 。 可惜 对 园林 不太 感冒 。 逛逛 玩...\n",
       "24        不到长城非好汉 ， 对于 爬 过 华山 的 我 来说 ， 长城 太 简单 了 ， 值得 一去 。\n",
       "25                                             很 值得 去 的 地方\n",
       "26                                               第一次 必到 景点\n",
       "27                                               好歹 也 是 长城\n",
       "28       早上 一大早 就 起床 去 看 升国旗 ， 很 庄严 ， 很 整齐 ， 就是 像 一个 节目...\n",
       "29                      登顶 是 俯瞰 故宫 的 绝佳 之 处 ， 崇祯帝 在 此 自缢 ！\n",
       "                               ...                        \n",
       "29970    景色 很 美 ， 原谅 我 的 语言 苍白 ， 池水 很清 ， 可能 是 折射 也 可能 是...\n",
       "29971                               人 好多 ， 人 好多 ， 人 好多 呀 ！\n",
       "29972    　 　 1 . 西塘 管 老太 臭豆腐 ， 这个 是 西塘 最 著名 的 小吃 了 ， 山寨...\n",
       "29973    798 艺术 中心   来 朝圣 了   到处 都 是 拍照 的 人 ， 到处 都 是 奇装...\n",
       "29974                  心诚则灵 ， 很多 拜佛 的 地方 都 是 有 信仰 的 人才 去 的\n",
       "29975    十一月 初来 的 ， 风 很大 但是 不 太冷 ， 坐 滑车 来回 ， 也 不错 ， 长城 ...\n",
       "29976                               总体 就是 一般般 吧 ， 不会 特别 推荐\n",
       "29977                                   大大的 雪场 ， 绚丽多彩 的 冰灯\n",
       "29978                          这 是 个 代表性 的 建筑 ， 代表性 的 地点 。\n",
       "29979      坐地铁 直奔 王府井 ， 因为 急 找 住 的 地方 把 背包 放下 ， 就 没 溜达 王府井\n",
       "29980                      西双版纳 热带 植物园 挺大 的 ， 这里 很多 热带植物 ！\n",
       "29981    从 上海 到 西塘 坐车 很 方便 ， 票价 也 不贵 。 冬天 的 西塘 虽然 冷 ， 但...\n",
       "29982    观音菩萨 的 道场 ， 从来 都 没有 遭遇 过 台风 的 平静 之地 。 岛 还是 比较 ...\n",
       "29983    西湖 美在 有 秀丽 的 自然风光 ， 动人 的 传说 ， 湖光山色 塔影 相映成趣 ， 连...\n",
       "29984                                             一点 都 不好玩\n",
       "29985                                                 值得一看\n",
       "29986    【 点评 有奖 第 14 季 】 青城山 风景 很 美 ， 寺庙 建设 很 特别 ， 但 景...\n",
       "29987                            人山人海 ， 太多人 了 ， 特别 是 跟 团 的\n",
       "29988                               预订 票 在 机器 上取 ， 快捷 方便 。\n",
       "29989         烟花 三月 下 扬州 ， 到 了 瘦西湖 才 算是 到 了 扬州 ， 算是 必 选项 。\n",
       "29990    秋日 的 下午 来到 这里 ， 游人 虽然 不少 ， 但是 也 不算 特别 多 ， 慢慢 走...\n",
       "29991    新晋 5A 级 国家 旅游 风景区 ， 渤海 广场 乘 8 路 公交车 终点 即 是 （ 票...\n",
       "29992              很 不错 ， 自己 爬上去 的 ， 走走停停 ， 认识 了 新 的 小伙伴 。\n",
       "29993                                  有何 功德 烧香 的 人 是 人山人海\n",
       "29994                               还 不错 ， 不过 商业化 比较严重 啦 ，\n",
       "29995    据说 这个 是 后来 建 的 , 没有 什么 历史 意义 , 登上 4 楼 就 累 的 不行...\n",
       "29996    颐和园 也 是 我 喜欢 的 北京 经典 景点 之一 ， 它 长长的 回廊 ， 万寿山 ， ...\n",
       "29997                             比较 小众 的 景点 ， 但是 真心 很 美 。\n",
       "29998    雨天 走走 ， 景色宜人 。 可惜 体力 有限 ， 走 了 一个 小时 不到 就 回去 了 ...\n",
       "29999    160607 - 08   两日   < br   / > n 路线 : < br   / ...\n",
       "Name: comments, Length: 130000, dtype: object"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['comments']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    " tv = TfidfVectorizer(max_features=50000)\n",
    " X_comments = tv.fit_transform(data.comments)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<130000x50000 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 1734791 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_comments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_t = X_comments[:train.shape[0]]\n",
    "X_te = X_comments[train.shape[0]:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(30000, 50000)"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_te.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y = train.Score.values\n",
    "x_train, x_val, y_train, y_val = train_test_split(X_t, y, test_size = 0.1, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 50000)"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_val.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lgb_train = lgb.Dataset(x_train, y_train)\n",
    "lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "params = {\n",
    "    'boosting_type': 'gbdt',\n",
    "    'objective': 'multiclass',\n",
    "    'num_class' : 6,\n",
    "    'metric': 'multi_error',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1]\tvalid_0's multi_error: 0.3949\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "[2]\tvalid_0's multi_error: 0.3941\n",
      "[3]\tvalid_0's multi_error: 0.3937\n",
      "[4]\tvalid_0's multi_error: 0.3953\n",
      "[5]\tvalid_0's multi_error: 0.3949\n",
      "[6]\tvalid_0's multi_error: 0.3945\n",
      "[7]\tvalid_0's multi_error: 0.3943\n",
      "[8]\tvalid_0's multi_error: 0.3951\n",
      "[9]\tvalid_0's multi_error: 0.3944\n",
      "[10]\tvalid_0's multi_error: 0.3946\n",
      "[11]\tvalid_0's multi_error: 0.3942\n",
      "[12]\tvalid_0's multi_error: 0.3936\n",
      "[13]\tvalid_0's multi_error: 0.3936\n",
      "[14]\tvalid_0's multi_error: 0.3947\n",
      "[15]\tvalid_0's multi_error: 0.3942\n",
      "[16]\tvalid_0's multi_error: 0.3935\n",
      "[17]\tvalid_0's multi_error: 0.3931\n",
      "[18]\tvalid_0's multi_error: 0.3923\n",
      "[19]\tvalid_0's multi_error: 0.3929\n",
      "[20]\tvalid_0's multi_error: 0.392\n",
      "[21]\tvalid_0's multi_error: 0.391\n",
      "[22]\tvalid_0's multi_error: 0.391\n",
      "[23]\tvalid_0's multi_error: 0.3903\n",
      "[24]\tvalid_0's multi_error: 0.3905\n",
      "[25]\tvalid_0's multi_error: 0.3896\n",
      "[26]\tvalid_0's multi_error: 0.3894\n",
      "[27]\tvalid_0's multi_error: 0.3891\n",
      "[28]\tvalid_0's multi_error: 0.3882\n",
      "[29]\tvalid_0's multi_error: 0.3878\n",
      "[30]\tvalid_0's multi_error: 0.3867\n",
      "[31]\tvalid_0's multi_error: 0.3868\n",
      "[32]\tvalid_0's multi_error: 0.3866\n",
      "[33]\tvalid_0's multi_error: 0.3858\n",
      "[34]\tvalid_0's multi_error: 0.385\n",
      "[35]\tvalid_0's multi_error: 0.3856\n",
      "[36]\tvalid_0's multi_error: 0.3848\n",
      "[37]\tvalid_0's multi_error: 0.3849\n",
      "[38]\tvalid_0's multi_error: 0.3842\n",
      "[39]\tvalid_0's multi_error: 0.3839\n",
      "[40]\tvalid_0's multi_error: 0.3838\n",
      "[41]\tvalid_0's multi_error: 0.3836\n",
      "[42]\tvalid_0's multi_error: 0.382\n",
      "[43]\tvalid_0's multi_error: 0.3803\n",
      "[44]\tvalid_0's multi_error: 0.3801\n",
      "[45]\tvalid_0's multi_error: 0.3799\n",
      "[46]\tvalid_0's multi_error: 0.3798\n",
      "[47]\tvalid_0's multi_error: 0.3794\n",
      "[48]\tvalid_0's multi_error: 0.3794\n",
      "[49]\tvalid_0's multi_error: 0.3787\n",
      "[50]\tvalid_0's multi_error: 0.3788\n",
      "[51]\tvalid_0's multi_error: 0.3781\n",
      "[52]\tvalid_0's multi_error: 0.3785\n",
      "[53]\tvalid_0's multi_error: 0.3785\n",
      "[54]\tvalid_0's multi_error: 0.3784\n",
      "[55]\tvalid_0's multi_error: 0.3781\n",
      "[56]\tvalid_0's multi_error: 0.3775\n",
      "[57]\tvalid_0's multi_error: 0.3774\n",
      "[58]\tvalid_0's multi_error: 0.3769\n",
      "[59]\tvalid_0's multi_error: 0.3767\n",
      "[60]\tvalid_0's multi_error: 0.3767\n",
      "[61]\tvalid_0's multi_error: 0.3764\n",
      "[62]\tvalid_0's multi_error: 0.3762\n",
      "[63]\tvalid_0's multi_error: 0.3761\n",
      "[64]\tvalid_0's multi_error: 0.3759\n",
      "[65]\tvalid_0's multi_error: 0.3755\n",
      "[66]\tvalid_0's multi_error: 0.376\n",
      "[67]\tvalid_0's multi_error: 0.3758\n",
      "[68]\tvalid_0's multi_error: 0.3755\n",
      "[69]\tvalid_0's multi_error: 0.3757\n",
      "[70]\tvalid_0's multi_error: 0.3755\n",
      "[71]\tvalid_0's multi_error: 0.3759\n",
      "[72]\tvalid_0's multi_error: 0.3753\n",
      "[73]\tvalid_0's multi_error: 0.3749\n",
      "[74]\tvalid_0's multi_error: 0.3745\n",
      "[75]\tvalid_0's multi_error: 0.3739\n",
      "[76]\tvalid_0's multi_error: 0.3741\n",
      "[77]\tvalid_0's multi_error: 0.3737\n",
      "[78]\tvalid_0's multi_error: 0.3735\n",
      "[79]\tvalid_0's multi_error: 0.3733\n",
      "[80]\tvalid_0's multi_error: 0.374\n",
      "[81]\tvalid_0's multi_error: 0.3739\n",
      "[82]\tvalid_0's multi_error: 0.3742\n",
      "[83]\tvalid_0's multi_error: 0.3743\n",
      "[84]\tvalid_0's multi_error: 0.3746\n",
      "[85]\tvalid_0's multi_error: 0.3745\n",
      "[86]\tvalid_0's multi_error: 0.3748\n",
      "[87]\tvalid_0's multi_error: 0.3743\n",
      "[88]\tvalid_0's multi_error: 0.3748\n",
      "[89]\tvalid_0's multi_error: 0.3745\n",
      "[90]\tvalid_0's multi_error: 0.3745\n",
      "[91]\tvalid_0's multi_error: 0.3744\n",
      "[92]\tvalid_0's multi_error: 0.3744\n",
      "[93]\tvalid_0's multi_error: 0.3745\n",
      "[94]\tvalid_0's multi_error: 0.374\n",
      "[95]\tvalid_0's multi_error: 0.3737\n",
      "[96]\tvalid_0's multi_error: 0.3743\n",
      "[97]\tvalid_0's multi_error: 0.3741\n",
      "[98]\tvalid_0's multi_error: 0.3738\n",
      "[99]\tvalid_0's multi_error: 0.3743\n",
      "[100]\tvalid_0's multi_error: 0.3741\n",
      "[101]\tvalid_0's multi_error: 0.374\n",
      "[102]\tvalid_0's multi_error: 0.3733\n",
      "[103]\tvalid_0's multi_error: 0.3734\n",
      "[104]\tvalid_0's multi_error: 0.3729\n",
      "[105]\tvalid_0's multi_error: 0.3726\n",
      "[106]\tvalid_0's multi_error: 0.3728\n",
      "[107]\tvalid_0's multi_error: 0.3723\n",
      "[108]\tvalid_0's multi_error: 0.3723\n",
      "[109]\tvalid_0's multi_error: 0.3724\n",
      "[110]\tvalid_0's multi_error: 0.3723\n",
      "[111]\tvalid_0's multi_error: 0.3727\n",
      "[112]\tvalid_0's multi_error: 0.3725\n",
      "[113]\tvalid_0's multi_error: 0.3723\n",
      "[114]\tvalid_0's multi_error: 0.3729\n",
      "[115]\tvalid_0's multi_error: 0.373\n",
      "[116]\tvalid_0's multi_error: 0.3732\n",
      "[117]\tvalid_0's multi_error: 0.3724\n",
      "[118]\tvalid_0's multi_error: 0.3723\n",
      "[119]\tvalid_0's multi_error: 0.3728\n",
      "[120]\tvalid_0's multi_error: 0.3723\n",
      "[121]\tvalid_0's multi_error: 0.372\n",
      "[122]\tvalid_0's multi_error: 0.3718\n",
      "[123]\tvalid_0's multi_error: 0.3719\n",
      "[124]\tvalid_0's multi_error: 0.3718\n",
      "[125]\tvalid_0's multi_error: 0.3719\n",
      "[126]\tvalid_0's multi_error: 0.3716\n",
      "[127]\tvalid_0's multi_error: 0.3717\n",
      "[128]\tvalid_0's multi_error: 0.3716\n",
      "[129]\tvalid_0's multi_error: 0.3711\n",
      "[130]\tvalid_0's multi_error: 0.3711\n",
      "[131]\tvalid_0's multi_error: 0.3713\n",
      "[132]\tvalid_0's multi_error: 0.3711\n",
      "[133]\tvalid_0's multi_error: 0.3711\n",
      "[134]\tvalid_0's multi_error: 0.3711\n",
      "[135]\tvalid_0's multi_error: 0.3708\n",
      "[136]\tvalid_0's multi_error: 0.371\n",
      "[137]\tvalid_0's multi_error: 0.3711\n",
      "[138]\tvalid_0's multi_error: 0.3711\n",
      "[139]\tvalid_0's multi_error: 0.3714\n",
      "[140]\tvalid_0's multi_error: 0.3716\n",
      "[141]\tvalid_0's multi_error: 0.3715\n",
      "[142]\tvalid_0's multi_error: 0.3716\n",
      "[143]\tvalid_0's multi_error: 0.3707\n",
      "[144]\tvalid_0's multi_error: 0.3709\n",
      "[145]\tvalid_0's multi_error: 0.3709\n",
      "[146]\tvalid_0's multi_error: 0.3706\n",
      "[147]\tvalid_0's multi_error: 0.3704\n",
      "[148]\tvalid_0's multi_error: 0.3704\n",
      "[149]\tvalid_0's multi_error: 0.3704\n",
      "[150]\tvalid_0's multi_error: 0.3703\n",
      "[151]\tvalid_0's multi_error: 0.3706\n",
      "[152]\tvalid_0's multi_error: 0.3701\n",
      "[153]\tvalid_0's multi_error: 0.3701\n",
      "[154]\tvalid_0's multi_error: 0.3702\n",
      "[155]\tvalid_0's multi_error: 0.3703\n",
      "[156]\tvalid_0's multi_error: 0.37\n",
      "[157]\tvalid_0's multi_error: 0.3697\n",
      "[158]\tvalid_0's multi_error: 0.3695\n",
      "[159]\tvalid_0's multi_error: 0.3699\n",
      "[160]\tvalid_0's multi_error: 0.3698\n",
      "[161]\tvalid_0's multi_error: 0.3694\n",
      "[162]\tvalid_0's multi_error: 0.3695\n",
      "[163]\tvalid_0's multi_error: 0.3692\n",
      "[164]\tvalid_0's multi_error: 0.3691\n",
      "[165]\tvalid_0's multi_error: 0.3691\n",
      "[166]\tvalid_0's multi_error: 0.369\n",
      "[167]\tvalid_0's multi_error: 0.3687\n",
      "[168]\tvalid_0's multi_error: 0.3686\n",
      "[169]\tvalid_0's multi_error: 0.368\n",
      "[170]\tvalid_0's multi_error: 0.3677\n",
      "[171]\tvalid_0's multi_error: 0.3676\n",
      "[172]\tvalid_0's multi_error: 0.3678\n",
      "[173]\tvalid_0's multi_error: 0.368\n",
      "[174]\tvalid_0's multi_error: 0.3682\n",
      "[175]\tvalid_0's multi_error: 0.3684\n",
      "[176]\tvalid_0's multi_error: 0.3684\n",
      "[177]\tvalid_0's multi_error: 0.3681\n",
      "[178]\tvalid_0's multi_error: 0.3677\n",
      "[179]\tvalid_0's multi_error: 0.3676\n",
      "[180]\tvalid_0's multi_error: 0.3679\n",
      "[181]\tvalid_0's multi_error: 0.368\n",
      "[182]\tvalid_0's multi_error: 0.3684\n",
      "[183]\tvalid_0's multi_error: 0.3685\n",
      "[184]\tvalid_0's multi_error: 0.3686\n",
      "[185]\tvalid_0's multi_error: 0.3685\n",
      "[186]\tvalid_0's multi_error: 0.3686\n",
      "[187]\tvalid_0's multi_error: 0.3686\n",
      "[188]\tvalid_0's multi_error: 0.3684\n",
      "[189]\tvalid_0's multi_error: 0.3687\n",
      "[190]\tvalid_0's multi_error: 0.3684\n",
      "[191]\tvalid_0's multi_error: 0.3685\n",
      "[192]\tvalid_0's multi_error: 0.3684\n",
      "[193]\tvalid_0's multi_error: 0.3679\n",
      "[194]\tvalid_0's multi_error: 0.3682\n",
      "[195]\tvalid_0's multi_error: 0.3682\n",
      "[196]\tvalid_0's multi_error: 0.3679\n",
      "[197]\tvalid_0's multi_error: 0.368\n",
      "[198]\tvalid_0's multi_error: 0.3679\n",
      "[199]\tvalid_0's multi_error: 0.3678\n",
      "[200]\tvalid_0's multi_error: 0.3679\n",
      "[201]\tvalid_0's multi_error: 0.368\n",
      "[202]\tvalid_0's multi_error: 0.3677\n",
      "[203]\tvalid_0's multi_error: 0.3679\n",
      "[204]\tvalid_0's multi_error: 0.3677\n",
      "[205]\tvalid_0's multi_error: 0.3682\n",
      "[206]\tvalid_0's multi_error: 0.3684\n",
      "[207]\tvalid_0's multi_error: 0.3682\n",
      "[208]\tvalid_0's multi_error: 0.3683\n",
      "[209]\tvalid_0's multi_error: 0.3683\n",
      "[210]\tvalid_0's multi_error: 0.3685\n",
      "[211]\tvalid_0's multi_error: 0.3685\n",
      "[212]\tvalid_0's multi_error: 0.3682\n",
      "[213]\tvalid_0's multi_error: 0.3682\n",
      "[214]\tvalid_0's multi_error: 0.3683\n",
      "[215]\tvalid_0's multi_error: 0.3682\n",
      "[216]\tvalid_0's multi_error: 0.3687\n",
      "[217]\tvalid_0's multi_error: 0.3685\n",
      "[218]\tvalid_0's multi_error: 0.3685\n",
      "[219]\tvalid_0's multi_error: 0.3688\n",
      "[220]\tvalid_0's multi_error: 0.3686\n",
      "[221]\tvalid_0's multi_error: 0.3685\n",
      "Early stopping, best iteration is:\n",
      "[171]\tvalid_0's multi_error: 0.3676\n"
     ]
    }
   ],
   "source": [
    "gbm = lgb.train(params,\n",
    "                lgb_train,\n",
    "                num_boost_round=3000,\n",
    "                valid_sets=lgb_eval,\n",
    "                early_stopping_rounds=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([5, 5, 5, ..., 4, 5, 4])"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_val"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Start predicting...\n"
     ]
    }
   ],
   "source": [
    "print('Start predicting...')\n",
    "# predict\n",
    "y_pred = gbm.predict(X_te, num_iteration=gbm.best_iteration)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[  1.02074999e-16,   2.16257277e-03,   3.71120457e-03,\n",
       "          1.31568605e-02,   1.65827920e-01,   8.15141442e-01],\n",
       "       [  1.04247014e-16,   1.01018201e-03,   2.99684811e-03,\n",
       "          1.93025844e-02,   1.66311175e-01,   8.10379211e-01],\n",
       "       [  1.06017181e-16,   1.17873314e-03,   7.03933393e-03,\n",
       "          1.03584372e-01,   3.54675617e-01,   5.33521944e-01],\n",
       "       ..., \n",
       "       [  1.87779534e-16,   1.21402048e-03,   2.99462243e-02,\n",
       "          2.15112306e-01,   2.76485860e-01,   4.77241589e-01],\n",
       "       [  1.37598314e-16,   7.47458132e-03,   4.32631506e-03,\n",
       "          1.20482904e-01,   2.17702535e-01,   6.50013665e-01],\n",
       "       [  1.53357707e-16,   3.31472179e-03,   4.65003534e-03,\n",
       "          1.60749787e-01,   2.76161699e-01,   5.55123757e-01]])"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Put the prediction matrix in a DataFrame so each class column can be picked by\n",
    "# its integer label.\n",
    "y_pred = pd.DataFrame(data=y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the six class probabilities into one score: column k is weighted by k,\n",
    "# except column 0, which is weighted by 1.\n",
    "class_weights = [1, 1, 2, 3, 4, 5]\n",
    "y_1 = sum(y_pred[col] * weight for col, weight in enumerate(class_weights))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predictions above 4.7 are snapped to exactly 5; everything else is kept as is.\n",
    "y_1 = y_1.mask(y_1 > 4.7, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Pair each test Id with its prediction by position. The predictions carry a fresh\n",
    "# 0..n-1 index from pd.DataFrame(y_pred), so index-based assignment would silently\n",
    "# produce NaN or misplaced rows if `test` ever had a different index.\n",
    "assert len(test) == len(y_1), 'one prediction is expected per test row'\n",
    "sub = pd.DataFrame(\n",
    "    {'id': test['Id'].values, 'Score': y_1.values},\n",
    "    columns=['id', 'Score'],  # fixed order: the CSV is written without a header\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>Score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9a1caf96-681e-3c11-b588-43ac742d7fd2</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>82b450db-65c2-351c-84fb-761d76582680</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2eec4606-590c-3fa2-b846-7f92441c54a6</td>\n",
       "      <td>4.412323</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>509f9a68-ac41-35ff-9d2e-2fc12f73ed7f</td>\n",
       "      <td>4.536333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>395f4b22-1c5f-328a-a19d-5065e0530cbc</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6f1202b2-0c1a-3d79-a675-453aad931e3e</td>\n",
       "      <td>4.509231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>b4bc951f-ce1e-32ae-b27c-bbecba1fd0ee</td>\n",
       "      <td>4.552376</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>30369496-e4b1-30b9-b427-fe23fd1712d6</td>\n",
       "      <td>3.743906</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>704a36e8-47b1-3b93-bb6a-43610c81e1a0</td>\n",
       "      <td>4.515514</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>8a5711e2-b855-3088-adcc-a4d634e2cbb9</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>13f31c50-3e46-3aca-a5eb-3fd619bfac61</td>\n",
       "      <td>4.227584</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>cb387563-37ed-3058-bc82-726365529c5e</td>\n",
       "      <td>4.610097</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>932ef7c5-15b7-3224-ab7b-1904ea851359</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>01523456-6960-3674-8e8a-0e7033ad73e3</td>\n",
       "      <td>4.415779</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>cebd3ff7-b08c-381d-8f7e-542166357ae2</td>\n",
       "      <td>4.567919</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>178ee4e7-6af7-3402-9844-a49b55b71ce1</td>\n",
       "      <td>4.225445</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>2db8c129-c41e-3efa-9685-c5924561f292</td>\n",
       "      <td>4.303909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>fcb6cd9d-202c-3a52-8673-46cc685b3278</td>\n",
       "      <td>4.267765</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>199e2bf3-29b2-37f3-a467-0cabbc78517a</td>\n",
       "      <td>4.511175</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>e9458490-b4e7-38a0-90bf-1c20c2f9a917</td>\n",
       "      <td>4.237361</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>90554e3c-fc13-363b-9ea9-22e1e3c9ec47</td>\n",
       "      <td>3.817186</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>73852def-c6f2-382a-88a8-777b35a45b89</td>\n",
       "      <td>4.421768</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>2989fa30-73ec-3d78-abcc-38db26eced1c</td>\n",
       "      <td>4.197192</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>ecbe4aed-6385-31e7-a96b-9e406182e1c8</td>\n",
       "      <td>4.293270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>4b03c811-7c26-3111-99d8-af1bc021dfac</td>\n",
       "      <td>4.115327</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>b154b63f-cd47-3332-bba5-560ddb40912b</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>8bd29df6-5095-32f2-bc2e-2ae0ceb3bdb6</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>47637f84-cec0-38af-b20a-7bb22e902f40</td>\n",
       "      <td>4.465056</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>ea1ab186-6c98-389f-bd18-471cdb46d907</td>\n",
       "      <td>4.629731</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>c3c79660-99f9-39ac-949d-088d6ce80d1e</td>\n",
       "      <td>4.545177</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29970</th>\n",
       "      <td>f0de95d2-ef2b-3361-a6df-3f4392d74e93</td>\n",
       "      <td>4.464813</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29971</th>\n",
       "      <td>d0a33ded-fed9-3d92-9c51-c6512d71a93d</td>\n",
       "      <td>4.548973</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29972</th>\n",
       "      <td>b00cd736-093a-3d1e-b750-41283e37e09b</td>\n",
       "      <td>4.259932</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29973</th>\n",
       "      <td>8be4f87a-c76b-3e65-8ecc-a25f209824da</td>\n",
       "      <td>4.283491</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29974</th>\n",
       "      <td>9079f642-7458-3c6d-9098-c4877cc81347</td>\n",
       "      <td>4.545142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29975</th>\n",
       "      <td>f0d47b46-9391-3d0c-b068-8753e4edd583</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29976</th>\n",
       "      <td>8e9afa16-33ae-3f51-8156-1e9e5337605c</td>\n",
       "      <td>3.237102</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29977</th>\n",
       "      <td>3a19d80a-1cdf-3616-9707-6f8a72b4513b</td>\n",
       "      <td>4.580044</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29978</th>\n",
       "      <td>73081902-03a0-3f36-b562-900d6fd81f6c</td>\n",
       "      <td>4.498993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29979</th>\n",
       "      <td>7fed708a-e036-36cb-98a4-f8de9fbbf3db</td>\n",
       "      <td>4.383259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29980</th>\n",
       "      <td>cdb9eed1-54fb-3454-b98d-645c3bdfca78</td>\n",
       "      <td>4.524165</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29981</th>\n",
       "      <td>0986165f-e9a5-394c-91d2-57149a4cad4b</td>\n",
       "      <td>4.511817</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29982</th>\n",
       "      <td>99aeecae-9a3b-337c-ad7b-ab82bbb499b4</td>\n",
       "      <td>4.536403</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29983</th>\n",
       "      <td>c0334558-d138-31cb-b714-0b6a320fa903</td>\n",
       "      <td>4.556677</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29984</th>\n",
       "      <td>03f30d5a-9478-3701-b8e0-dec924ffe9a5</td>\n",
       "      <td>2.937353</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29985</th>\n",
       "      <td>a8c15edb-40a9-3014-a021-83d0d747f2d9</td>\n",
       "      <td>4.585175</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29986</th>\n",
       "      <td>856aefdd-dc4c-36b2-a102-d9333b44186c</td>\n",
       "      <td>4.588905</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29987</th>\n",
       "      <td>179cf47c-4eb3-3c94-b097-52ab1844d67b</td>\n",
       "      <td>4.517165</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29988</th>\n",
       "      <td>c5e81b73-0412-3120-a94e-c14f3dc942ab</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29989</th>\n",
       "      <td>803661ce-e446-32c6-a3f2-707e7d53891d</td>\n",
       "      <td>4.413288</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29990</th>\n",
       "      <td>0c972f48-a836-3bdc-8dd5-fc4ac7d3385e</td>\n",
       "      <td>4.476636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29991</th>\n",
       "      <td>927ed129-56ea-33fa-bcd3-f49527693fbb</td>\n",
       "      <td>4.248025</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29992</th>\n",
       "      <td>cda5d912-3bf9-382a-9b3a-5c08bc665360</td>\n",
       "      <td>4.608218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29993</th>\n",
       "      <td>035184d7-2f13-32d2-8479-77532cdf6152</td>\n",
       "      <td>4.423604</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29994</th>\n",
       "      <td>275498bc-12b3-3086-ab78-b7d0ea0da2e9</td>\n",
       "      <td>4.174299</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29995</th>\n",
       "      <td>b4f946fe-0deb-3f7a-a8ae-ff1446818ec0</td>\n",
       "      <td>3.689994</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29996</th>\n",
       "      <td>f030f2ed-1a39-32ff-a2dd-14fb757b2cb2</td>\n",
       "      <td>4.587964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29997</th>\n",
       "      <td>647597a2-ec49-3f27-8924-363ddef52ca0</td>\n",
       "      <td>4.198595</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29998</th>\n",
       "      <td>1cbc2b1b-c7a6-34d6-93ae-a7da9cd93d37</td>\n",
       "      <td>4.498454</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29999</th>\n",
       "      <td>b092aa49-4688-3c41-8aec-43c03186567f</td>\n",
       "      <td>4.375130</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>30000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         id     Score\n",
       "0      9a1caf96-681e-3c11-b588-43ac742d7fd2  5.000000\n",
       "1      82b450db-65c2-351c-84fb-761d76582680  5.000000\n",
       "2      2eec4606-590c-3fa2-b846-7f92441c54a6  4.412323\n",
       "3      509f9a68-ac41-35ff-9d2e-2fc12f73ed7f  4.536333\n",
       "4      395f4b22-1c5f-328a-a19d-5065e0530cbc  5.000000\n",
       "5      6f1202b2-0c1a-3d79-a675-453aad931e3e  4.509231\n",
       "6      b4bc951f-ce1e-32ae-b27c-bbecba1fd0ee  4.552376\n",
       "7      30369496-e4b1-30b9-b427-fe23fd1712d6  3.743906\n",
       "8      704a36e8-47b1-3b93-bb6a-43610c81e1a0  4.515514\n",
       "9      8a5711e2-b855-3088-adcc-a4d634e2cbb9  5.000000\n",
       "10     13f31c50-3e46-3aca-a5eb-3fd619bfac61  4.227584\n",
       "11     cb387563-37ed-3058-bc82-726365529c5e  4.610097\n",
       "12     932ef7c5-15b7-3224-ab7b-1904ea851359  5.000000\n",
       "13     01523456-6960-3674-8e8a-0e7033ad73e3  4.415779\n",
       "14     cebd3ff7-b08c-381d-8f7e-542166357ae2  4.567919\n",
       "15     178ee4e7-6af7-3402-9844-a49b55b71ce1  4.225445\n",
       "16     2db8c129-c41e-3efa-9685-c5924561f292  4.303909\n",
       "17     fcb6cd9d-202c-3a52-8673-46cc685b3278  4.267765\n",
       "18     199e2bf3-29b2-37f3-a467-0cabbc78517a  4.511175\n",
       "19     e9458490-b4e7-38a0-90bf-1c20c2f9a917  4.237361\n",
       "20     90554e3c-fc13-363b-9ea9-22e1e3c9ec47  3.817186\n",
       "21     73852def-c6f2-382a-88a8-777b35a45b89  4.421768\n",
       "22     2989fa30-73ec-3d78-abcc-38db26eced1c  4.197192\n",
       "23     ecbe4aed-6385-31e7-a96b-9e406182e1c8  4.293270\n",
       "24     4b03c811-7c26-3111-99d8-af1bc021dfac  4.115327\n",
       "25     b154b63f-cd47-3332-bba5-560ddb40912b  5.000000\n",
       "26     8bd29df6-5095-32f2-bc2e-2ae0ceb3bdb6  5.000000\n",
       "27     47637f84-cec0-38af-b20a-7bb22e902f40  4.465056\n",
       "28     ea1ab186-6c98-389f-bd18-471cdb46d907  4.629731\n",
       "29     c3c79660-99f9-39ac-949d-088d6ce80d1e  4.545177\n",
       "...                                     ...       ...\n",
       "29970  f0de95d2-ef2b-3361-a6df-3f4392d74e93  4.464813\n",
       "29971  d0a33ded-fed9-3d92-9c51-c6512d71a93d  4.548973\n",
       "29972  b00cd736-093a-3d1e-b750-41283e37e09b  4.259932\n",
       "29973  8be4f87a-c76b-3e65-8ecc-a25f209824da  4.283491\n",
       "29974  9079f642-7458-3c6d-9098-c4877cc81347  4.545142\n",
       "29975  f0d47b46-9391-3d0c-b068-8753e4edd583  5.000000\n",
       "29976  8e9afa16-33ae-3f51-8156-1e9e5337605c  3.237102\n",
       "29977  3a19d80a-1cdf-3616-9707-6f8a72b4513b  4.580044\n",
       "29978  73081902-03a0-3f36-b562-900d6fd81f6c  4.498993\n",
       "29979  7fed708a-e036-36cb-98a4-f8de9fbbf3db  4.383259\n",
       "29980  cdb9eed1-54fb-3454-b98d-645c3bdfca78  4.524165\n",
       "29981  0986165f-e9a5-394c-91d2-57149a4cad4b  4.511817\n",
       "29982  99aeecae-9a3b-337c-ad7b-ab82bbb499b4  4.536403\n",
       "29983  c0334558-d138-31cb-b714-0b6a320fa903  4.556677\n",
       "29984  03f30d5a-9478-3701-b8e0-dec924ffe9a5  2.937353\n",
       "29985  a8c15edb-40a9-3014-a021-83d0d747f2d9  4.585175\n",
       "29986  856aefdd-dc4c-36b2-a102-d9333b44186c  4.588905\n",
       "29987  179cf47c-4eb3-3c94-b097-52ab1844d67b  4.517165\n",
       "29988  c5e81b73-0412-3120-a94e-c14f3dc942ab  5.000000\n",
       "29989  803661ce-e446-32c6-a3f2-707e7d53891d  4.413288\n",
       "29990  0c972f48-a836-3bdc-8dd5-fc4ac7d3385e  4.476636\n",
       "29991  927ed129-56ea-33fa-bcd3-f49527693fbb  4.248025\n",
       "29992  cda5d912-3bf9-382a-9b3a-5c08bc665360  4.608218\n",
       "29993  035184d7-2f13-32d2-8479-77532cdf6152  4.423604\n",
       "29994  275498bc-12b3-3086-ab78-b7d0ea0da2e9  4.174299\n",
       "29995  b4f946fe-0deb-3f7a-a8ae-ff1446818ec0  3.689994\n",
       "29996  f030f2ed-1a39-32ff-a2dd-14fb757b2cb2  4.587964\n",
       "29997  647597a2-ec49-3f27-8924-363ddef52ca0  4.198595\n",
       "29998  1cbc2b1b-c7a6-34d6-93ae-a7da9cd93d37  4.498454\n",
       "29999  b092aa49-4688-3c41-8aec-43c03186567f  4.375130\n",
       "\n",
       "[30000 rows x 2 columns]"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write the submission without a header row and without the index column.\n",
    "sub.to_csv('tf_idf.csv', header=False, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
